### © Aaron Gilkison and Maciej Kurzynski
### Vectors of Violence: Legitimation and Distribution of State Power in the ''People’s Liberation Army Daily'' (Jiefangjun Bao)
### The Journal of Cultural Analytics

import requests
import re
import time
import os
import json
import spacy
import numpy as np
from multiprocessing import Pool
from bs4 import BeautifulSoup
from datetime import datetime

# Directory that receives one .txt file per scraped article.
scraped_dir = "scraped_jfjb"
# exist_ok=True creates the directory only when it is missing, without the
# separate exists() check (which leaves a window between check and creation).
os.makedirs(scraped_dir, exist_ok=True)

# URL templates: the upper-case placeholder in each one (YEAR / ISSUE / DOC)
# is substituted with a concrete value before the page is requested.
year_url = "https://dlib.eastview.com/browse/issues/2323/YEAR/udb/570"
issue_url = "https://dlib.eastview.com/browse/issue/ISSUE/udb/570"
article_url = "https://dlib.eastview.com/browse/doc/DOC/"

# First and last year to scrape; both ends are included in `years`.
year_start = 1956
year_end = 1989
years = list(range(year_start, year_end + 1))

# Values passed to time.sleep() after each year-page, issue-page and
# article request, respectively.
sleep_year = 3
sleep_issue = 3
sleep_article = 3

# Multiplied by the sub-page number to obtain the pager offset of an issue listing.
articles_per_page = 20

# Maps each year to the issues found on that year's overview page,
# stored as a list of (issue_id, issue_date) tuples.
year_issues_dict = {}
for year in years:
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] Getting issue links for year {year}...")
    response = requests.get(year_url.replace('YEAR', str(year)))
    parsed_page = BeautifulSoup(response.text, 'html.parser')
    # every anchor whose href points at an issue page
    issue_anchors = parsed_page.find_all("a", {'href': re.compile(r'/browse/issue/[0-9]{5,}/udb/570')})
    issues_of_year = []
    for anchor in issue_anchors:
        # the issue id is the fourth "/"-separated field of the href
        issue_id = int(anchor["href"].split("/")[3])
        # the issue date is the last whitespace-separated token of the link text
        issue_date = anchor.contents[0].strip().split()[-1]
        issues_of_year.append((issue_id, issue_date))
    year_issues_dict[year] = issues_of_year
    time.sleep(sleep_year)

# Maps each issue id to its list of (article_id, article_title) tuples.
issue_articles_dict = {}
# an example filename for a single article: 1990_11-24_743340_西北风进行首次反导试验_14344652.txt
# which is year_issueDate_issueID_title_articleID.txt
article_href_re = re.compile(r"/browse/doc/[0-9]{5,}")  # compiled once, reused for every listing page
for year in year_issues_dict:
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking existing files...")
    # Files already on disk for this year drive the resume logic below.
    filenames = [fn for fn in os.listdir(scraped_dir) if fn.startswith(str(year))]
    # The article id is the last "_"-separated field of the filename; a set keeps
    # the per-article "already downloaded?" test cheap.
    downloaded_articles_ids = {int(fn.split("_")[-1].split(".")[0]) for fn in filenames}
    months_scraped = [int(fn.split("_")[1].split("-")[0]) for fn in filenames]
    # Issues whose month is greater than this value are skipped as already done.
    # 13 is larger than any month, so nothing is skipped for a year with no files.
    # NOTE(review): using min() only resumes correctly if the year page lists
    # issues newest-first — confirm against the site.
    last_scraped_month = min(months_scraped) if filenames else 13
    for issue_id, issue_date in year_issues_dict[year]:
        if int(str(issue_date).split("-")[0]) > last_scraped_month:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Issue {year}:{issue_date} already exists, skipping...")
            continue
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Scraping issue: {year}:{issue_id}, {issue_date}")
        current_issue_URL = re.sub('ISSUE', str(issue_id), issue_url)
        page = requests.get(current_issue_URL)
        soup = BeautifulSoup(page.text, 'html.parser')
        pages_n = 1
        pagination = soup.find_all(class_="Pagination")
        if pagination:
            # page count is taken as the number of <li> entries in the pager minus one
            pages_n = len(pagination[0].find_all("li")) - 1
        # get all links to articles on the first listing page
        article_links = soup.find_all("a", {'href': article_href_re})
        time.sleep(sleep_issue)
        # Remaining listing pages, if any (the range is empty for single-page issues).
        for page_n in range(1, pages_n):
            curr_offset = articles_per_page * page_n
            # Only the numeric issue id belongs in the URL. The previous code
            # interpolated the whole (issue_id, issue_date) tuple here, which
            # produced a malformed address for every sub-page.
            subpage_URL = f"{current_issue_URL}?issue={issue_id}&Direction=asc&pager.offset={curr_offset}"
            page = requests.get(subpage_URL)
            soup = BeautifulSoup(page.text, 'html.parser')
            # add those additional links to articles from the subpage
            article_links.extend(soup.find_all("a", {'href': article_href_re}))
            time.sleep(sleep_issue)
        # put all links to articles in the issue dictionary,
        # each key is an issue id, and each value is a list of tuples (article_id, article_title);
        # titles keep only word characters and whitespace so they can be used in filenames
        articles = [
            (int(article_a["href"].split("/")[3]), re.sub(r'[^\w\s]', '', article_a.contents[0].strip()))
            for article_a in article_links
        ]
        issue_articles_dict[issue_id] = articles

        # start downloading all articles in the dictionary for the given issue
        for article_id, article_title in articles:
            article_filename = f"{year}_{issue_date}_{issue_id}_{article_title}_{article_id}.txt"
            if article_id in downloaded_articles_ids:
                print(f"[{datetime.now().strftime('%H:%M:%S')}] File {article_filename} already exists, skipping...")
                continue
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Scraping article:", year, issue_date, article_id, article_title)
            current_article_URL = re.sub('DOC', str(article_id), article_url)
            page = requests.get(current_article_URL)
            soup = BeautifulSoup(page.text, 'html.parser')
            try:
                text = soup.find_all('div', class_="ArticleText")[0].get_text().split("Article Title")[0] # get everything before Article Title
                text = re.sub(r'[a-zA-Z]', '', text).strip() # basic pre-processing: removing non-Chinese text
                # NOTE(review): written with the platform default encoding; the JSON
                # export step later in this file reads these files the same way, so
                # switch both to encoding="utf-8" together.
                with open(f"{scraped_dir}/{article_filename}", "w") as f:
                    f.write(text)
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop
                # the run instead of being recorded as a failed article.
                print(f"[{datetime.now().strftime('%H:%M:%S')}] There was a problem with article: {article_filename} ({exc!r})")
                # UTF-8 so that logging a Chinese title cannot itself fail in the handler.
                with open("failed_articles.txt", "a", encoding="utf-8") as f:
                    f.write(f"{year},{issue_id},{issue_date},{article_id},{article_title}\n")
            time.sleep(sleep_article)

# dumping all this into a single json
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# records in the JSON reproducible between runs. Hidden files are ignored.
filenames = sorted(fn for fn in os.listdir(scraped_dir) if not fn.startswith("."))

articles = []
# Plain iteration: the previous loop wrapped this in tqdm(), which is never
# imported in this file and therefore raised NameError.
for filename in filenames:
    # NOTE(review): read with the platform default encoding, matching how the
    # scraper above writes these files; change both together.
    with open(scraped_dir + "/" + filename) as f:
        text = f.read()
    # Filenames are year_date_issueID_title_articleID.txt. The title is free
    # text that may itself contain "_", so the fixed fields are peeled off from
    # both ends instead of being addressed by position.
    year_part, date_part, issue_part, remainder = filename.split("_", 3)
    article_title, article_part = remainder.rsplit("_", 1)
    articles.append({
        "article_id": int(article_part.split(".")[0]),
        "issue_date": year_part + "-" + date_part,
        "issue_id": int(issue_part),
        "title": article_title,
        "text": text,
    })

filename_JFJB = 'JFJB_1956-1989.json'

# NOTE(review): written with the platform default encoding; the word-splitting
# step below reads it back the same way, so change both together.
with open(filename_JFJB, "w") as outfile:
    json.dump(articles, outfile, indent=2, ensure_ascii=False)


# splitting the texts into words (which adds a new field "split_text" to each dictionary item)
    
# spaCy pipeline loaded by name; the word- and sentence-splitting functions
# below call it as nlp(text)
nlp = spacy.load("zh_core_web_lg")

# number of worker processes; passed to Pool() and also used as the number of
# sublists the data is divided into
number_of_processes = 4

# Worker function: tokenise every text of one sublist with the spaCy pipeline
def split_list_of_chunks(list_of_chunks):
    """Return, for each text in *list_of_chunks*, the list of its token strings.

    The output has one inner list per input text, in input order.
    """
    return [[token.text for token in nlp(text)] for text in list_of_chunks]

# NOTE(review): read with the platform default encoding, matching how this
# file is written earlier in the script; change both together.
with open(filename_JFJB, "r") as infile:
    JFJB = json.load(infile)

outname = filename_JFJB.replace('.json', '-split.json')

chunks = [article["text"] for article in JFJB]

# Divide the texts into `number_of_processes` contiguous sublists whose sizes
# differ by at most one (the first `remainder` sublists get the extra item).
# Plain list slices are used instead of np.array_split: converting a list of
# strings to a NumPy array yields a fixed-width unicode array, which pads every
# article to the length of the longest one and hands numpy.str_ objects rather
# than plain str to the workers.
base_size, remainder = divmod(len(chunks), number_of_processes)
chunks_split = []
slice_start = 0
for part_index in range(number_of_processes):
    slice_stop = slice_start + base_size + (1 if part_index < remainder else 0)
    chunks_split.append(chunks[slice_start:slice_stop])
    slice_start = slice_stop

print(f"There are {len(chunks_split)} sublists.")
# NOTE(review): this Pool is created at module top level without an
# `if __name__ == "__main__":` guard. That works with the "fork" start method;
# under "spawn"/"forkserver" the workers re-import this script from the top —
# confirm the platform this is run on.
with Pool(number_of_processes) as p:
    print("Starting multiprocessing...")
    results_all = p.map(split_list_of_chunks, chunks_split)

# Flatten the per-sublist results; Pool.map preserves input order, so
# results[i] holds the tokens of JFJB[i].
results = []
for result in results_all:
    results.extend(result)
print(len(results))

# Characters stripped from the space-joined tokens: common Chinese/Western
# punctuation, brackets, ASCII letters and digits. Raw string so that "\[" and
# "\]" are regex escapes rather than invalid string escapes.
strip_chars_re = re.compile(r"[“”、.。,，;；:：\[【\]】（）()!！?？a-zA-Z0-9]")
for index, tokens in enumerate(results):
    cleaned = strip_chars_re.sub("", " ".join(tokens))
    # split()/join collapses the whitespace runs left behind by removed tokens
    JFJB[index]["split_text"] = " ".join(cleaned.split())

# Write the word-split output to a .json file
with open(outname, 'w', encoding='utf8') as out:
    json.dump(JFJB, out, ensure_ascii=False, indent=4)


# splitting into sentences
    
def process_chunk(chunk):
    """Add a 'sentences' list to every item of *chunk* and return the items as a list.

    Each item's 'text' is run through the spaCy pipeline. Every sentence is
    reduced to its non-punctuation, non-blank tokens joined by single spaces;
    sentences that end up empty are dropped. Items are modified in place.
    """
    annotated = []
    for record in chunk:
        sentences = []
        for span in nlp(record['text']).sents:
            kept = [tok.text for tok in span if not (tok.is_punct or tok.text.strip() == '')]
            # split() treats newlines like any other whitespace and ignores
            # leading/trailing blanks, so re-joining leaves exactly one space
            # between words
            normalized = ' '.join(' '.join(kept).split())
            if normalized:
                sentences.append(normalized)
        record['sentences'] = sentences
        annotated.append(record)
    return annotated

# One slice of the corpus per worker process
chunks_split = np.array_split(JFJB, number_of_processes)

with Pool(number_of_processes) as pool:
    results = pool.map(process_chunk, chunks_split)

# Concatenate the per-slice lists back into one list of items
processed_data = []
for chunk_result in results:
    for processed_item in chunk_result:
        processed_data.append(processed_item)

# this is the final data we use for the analysis
with open('JFJB_1956-1989_split_sent.json', 'w', encoding='utf-8') as f:
    json.dump(processed_data, f, ensure_ascii=False, indent=4)